In this document we present the joint analysis of the PASS1A metabolomics datasets.
Load the data from the cloud, including: phenotypic data, metabolomic datasets, and metabolomics dictionary.
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/supervised_normalization_functions.R")
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/unsupervised_normalization_functions.R")
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/gcp_functions.R")
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/association_analysis_methods.R")
source("~/Desktop/repos/motrpac-bic-norm-qc/tools/data_aux_functions.R")
source("~/Desktop/repos/motrpac/tools/prediction_ml_tools.R")
library(randomForest) # for classification tests
# Load the dmaqc data
# (phenotypic data; load_from_bucket returns a list and the first element
# is the merged table)
merged_dmaqc_data = load_from_bucket("merged_dmaqc_data2019-10-15.RData",
"gs://bic_data_analysis/pass1a/pheno_dmaqc/",F)
merged_dmaqc_data = merged_dmaqc_data[[1]]
# index rows by vial label so metabolomics datasets (whose columns are vial
# labels) can be matched to the phenotypic data below
rownames(merged_dmaqc_data) = as.character(merged_dmaqc_data$vial_label)
# define the tissue variable
merged_dmaqc_data$tissue = merged_dmaqc_data$sampletypedescription
# define the time to freeze variable:
# time from death to collection plus time from collection to freeze (minutes)
merged_dmaqc_data$time_to_freeze = merged_dmaqc_data$calculated.variables.time_death_to_collect_min +
merged_dmaqc_data$calculated.variables.time_collect_to_freeze_min
# col time vs. control — exploratory code kept for reference (not run)
# df = data.frame(
# bid = merged_dmaqc_data$bid,
# edta_col_time = merged_dmaqc_data$calculated.variables.edta_coll_time,
# time_to_freeze = merged_dmaqc_data$time_to_freeze,
# is_control = merged_dmaqc_data$animal.key.is_control,
# tp = merged_dmaqc_data$animal.key.timepoint,
# tissue = merged_dmaqc_data$specimen.processing.sampletypedescription
# )
# df = unique(df)
# boxplot(edta_col_time/3600 ~ is_control,df)
# boxplot(edta_col_time/3600 - tp ~ is_control,df)
# wilcox.test(edta_col_time/3600 ~ is_control,df)
# blood freeze times
# For EDTA plasma the relevant interval is from the EDTA spin to the freeze
# time, not death-to-freeze, so time_to_freeze is overwritten for these rows.
blood_samples =
  merged_dmaqc_data$specimen.processing.sampletypedescription ==
  "EDTA Plasma"
blood_freeze_time =
  as.difftime(merged_dmaqc_data$specimen.processing.t_freeze,units = "mins") -
  as.difftime(merged_dmaqc_data$specimen.processing.t_edtaspin,units="mins")
blood_freeze_time = as.numeric(blood_freeze_time)
# Overwrite time_to_freeze for blood samples with the spin-to-freeze interval.
# (The original used a chained assignment `a = b[i] = c[i]`, which is easy to
# misread; the two assignments below produce the exact same values.)
merged_dmaqc_data$time_to_freeze[blood_samples] = blood_freeze_time[blood_samples]
time_to_freeze = blood_freeze_time[blood_samples]
# NOTE(review): biospec_cols later lists "calculated.variables.time_to_freeze",
# not the "time_to_freeze" column corrected here — confirm which column the
# downstream adjustment is meant to use.
# Load our parsed metabolomics datasets
# (a named list of per-tissue/platform datasets; elements used below include
# $sample_data, $sample_meta, $data_raw_rownames and $row_annot)
metabolomics_parsed_datasets = load_from_bucket(
file = "metabolomics_parsed_datasets_pass1a_external1.RData",
bucket = "gs://bic_data_analysis/pass1a/metabolomics/")[[1]]
# # # Read the dictionary — kept for reference (not run)
# dict_bucket =
# "gs://motrpac-external-release1-results/metabolomics_targeted/motrpac_metabolomics_data_dictionary-v1.1.5.txt"
# dict_download = get_single_file_from_bucket_to_local_dir(dict_bucket)
# metabolomics_dict = fread(dict_download[[1]],data.table = F)
# metabolomics_dict[grepl("gluc",metabolomics_dict[,2],ignore.case = T),]
Define the variables to be adjusted for:
# Biospecimen covariates to adjust for in the models below.
# NOTE(review): "calculated.variables.time_to_freeze" is a dmaqc column;
# the blood-corrected variable computed above is "time_to_freeze" — confirm
# the intended column.
biospec_cols = c(
"acute.test.distance",
"calculated.variables.time_to_freeze",
# "calculated.variables.edta_coll_time", # no need - see code above for blood
"bid" # required for matching datasets
)
# Design variables for the differential analysis
differential_analysis_cols = c(
"animal.registration.sex",
"animal.key.timepoint",
"animal.key.is_control"
)
# Pipeline QC variable: sample run order (regressed out during preprocessing)
pipeline_qc_cols = c("sample_order")
Some sites do not use the log transformation on their dataset. In this section we plot the coefficient of variation as a function of the mean intensity. We take a single dataset as an example to show how log-transformed data have reduced dependency and smoother plots.
As an additional analysis we also plot the number of missing values per metabolite as a function of its mean intensity. We show that while there is high correlation, some missing values appear at fairly high intensities. This is important for imputation as some sites use some fixed low value instead of knn imputation.
# Plot cv vs means
library(gplots)
# example dataset: white adipose, untargeted HILIC-positive, unnamed subset
d = metabolomics_parsed_datasets[["white_adipose_powder,metab_u_hilicpos,unnamed"]]
# rows = metabolites, columns = samples
dx = d$sample_data
# Coefficient of variation (sd/mean), ignoring missing values.
CoV <- function(x) {
  sd(x, na.rm = TRUE) / mean(x, na.rm = TRUE)
}
# per-metabolite (row) means and CoVs on the raw intensities
dmeans = apply(dx,1,mean,na.rm=T)
CoVs = apply(dx,1,CoV)
# keep metabolites whose CoV is defined
inds = !is.na(CoVs)
df = data.frame(Mean_intensity = dmeans[inds],CoV = CoVs[inds])
plot(CoV~Mean_intensity,df,cex=0.5,pch=20,main="Raw data")
lines(lowess(CoV~Mean_intensity,df),lty=2,lwd=2,col="blue")
# Repeat after log2
dx = log(1+d$sample_data,base=2)
dmeans = apply(dx,1,mean,na.rm=T)
CoVs = apply(dx,1,CoV)
inds = !is.na(CoVs)
df = data.frame(Mean_intensity = dmeans[inds],CoV = CoVs[inds])
plot(CoV~Mean_intensity,df,cex=0.5,pch=20,main="Log2 data")
lines(lowess(CoV~Mean_intensity,df),lty=2,lwd=2,col="blue")
# Plot number of NAs vs intensity mean
dx = log(1+d$sample_data,base=2)
dmeans = apply(dx,1,mean,na.rm=T)
num_nas = rowSums(is.na(dx))
# NOTE(review): `inds` here is the mask from the log2 CoV section above;
# it is reused so the same metabolite subset appears in this plot
df = data.frame(Num_NAs = num_nas[inds],Mean_intensity = dmeans[inds])
rho = cor(df$Num_NAs,df$Mean_intensity,method="spearman")
rho = format(rho,digits=2)
plot(Num_NAs~Mean_intensity,df,cex=0.5,pch=20,
main=paste("Spearman:",rho))
We go over each dataset and merge the named and unnamed subparts of the untargeted datasets. For targeted data, we merge datasets that are from the same site and tissue.
Then the preprocessing of the new merged datasets is as follows: (1) log the data (values are raw intensities), (2) remove rows with \(>20\%\) missing values, and (3) impute missing values using knn (k=10).
Important: datasets that do not have unique metabolite names or sample ids probably had errors during parsing and are therefore ignored. Also, untargeted datasets for which merging of the named and unnamed subsets failed (e.g., due to different sample ids) are ignored as well.
Finally we add an additional version for each dataset by directly regressing out the sample order component. See https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4757603/ for more details (trend correction is also called background correction, and this is done for each batch separately).
# Merge and preprocess all parsed datasets.
# Untargeted: merge the named and unnamed subsets. Targeted: merge datasets
# from the same site/tissue that cover exactly the same vials. Each merged
# dataset is then log2-transformed, filtered, knn-imputed, and a sample-order
# adjusted version is added.
metabolomics_processed_datasets = list()
# set a binary vector indicating which entries to skip
metabolomics_raw_datasets_skip = rep(F,length(metabolomics_parsed_datasets))
names(metabolomics_raw_datasets_skip) = names(metabolomics_parsed_datasets)
# collects merge failure messages for the untargeted named/unnamed merges
untargeted_merge_errors = list()
for(currname in names(metabolomics_parsed_datasets)){
  if(metabolomics_raw_datasets_skip[currname]){next}
  # dataset names look like "tissue,platform,named|unnamed"; drop the last
  # token to build the names of the two untargeted subsets
  arr = strsplit(currname,split=",")[[1]]
  arr = arr[-length(arr)]
  name1 = paste(paste(arr,collapse=","),"named",sep=",")
  name2 = paste(paste(arr,collapse=","),"unnamed",sep=",")
  # If dataset is untargeted, merge named and unnamed, update the dataset
  # currname. Skip the dataset if the merge fails, but store the error.
  # If the dataset is targeted - go over all datasets with the same vial
  # labels and merge if possible.
  if(name2 %in% names(metabolomics_parsed_datasets)){
    d1 = metabolomics_parsed_datasets[[name1]]
    d2 = metabolomics_parsed_datasets[[name2]]
    newd = NULL
    tryCatch({
      newd = merge_named_and_unnamed_metabolomics_datasets(d1,d2,strict = F)
    }, error = function(e) {
      # store the error for later inspection (the list was previously
      # declared but never populated)
      untargeted_merge_errors[[currname]] <<- conditionMessage(e)
    })
    currname = paste(paste(arr,collapse=","),"untargeted",sep=",")
    metabolomics_raw_datasets_skip[name1] = T
    metabolomics_raw_datasets_skip[name2] = T
    if(is.null(newd)){
      print(paste("# Skipping dataset",
                  currname,
                  "because merging the named and unnamed subsets failed"))
      next
    }
  }
  else{
    # targeted dataset: look for other targeted datasets from the same site
    # that cover exactly the same vial set and merge them in
    metabolomics_raw_datasets_skip[currname] = T
    newd = metabolomics_parsed_datasets[[currname]]
    curr_vials = colnames(newd$sample_data)
    curr_site = tolower(unique(newd$sample_meta$site))
    for(another_dataset in names(metabolomics_raw_datasets_skip)){
      if(metabolomics_raw_datasets_skip[another_dataset]){next}
      another_d = metabolomics_parsed_datasets[[another_dataset]]
      another_vials = colnames(another_d$sample_data)
      another_d_site = tolower(unique(another_d$sample_meta$site))
      another_shared_vials = intersect(another_vials,curr_vials)
      if(curr_site == another_d_site &&
         length(another_shared_vials)==length(another_vials) &&
         length(another_shared_vials)==length(curr_vials)
      ){
        # examine the metadata
        curr_meta = newd$sample_meta
        another_meta = another_d$sample_meta
        # at this point we know that both datasets have the same samples,
        # but not necessarily the same controls; some controls were only
        # partially submitted, so we intersect the metadata rows here
        shared_meta_samples = intersect(rownames(curr_meta),rownames(another_meta))
        # make sure that the vials, types, and order are the same
        if(length(shared_meta_samples) < 1 ||
           !all(curr_meta[shared_meta_samples,1:3]==
                another_meta[shared_meta_samples,1:3])){
          next # continue if cannot merge
        }
        try({
          newd = merge_named_and_unnamed_metabolomics_datasets(
            newd,another_d,strict = F) # no strict merge thanks to test above
          metabolomics_raw_datasets_skip[another_dataset] = T
        })
      }
    }
    # update the name of the dataset
    # we keep track of the number of datasets from the given site, as some
    # merges may fail (the original had a redundant duplicated assignment
    # `currname = currname = ...` here)
    currname = paste(arr[1],curr_site,sep=",")
    num_currname = sum(grepl(currname,names(metabolomics_processed_datasets)))
    currname = paste(currname,num_currname+1,sep=",")
  }
  print(paste("#### Analyzing data from:",currname))
  curr_data = newd$sample_data
  # re-parse the values to numeric and verify nothing changed
  curr_data2 = recast_numeric_data_frame(curr_data)
  curr_data_mat = as.matrix(curr_data2)
  print(paste("no errors in parsing numeric values:",
              all(curr_data==curr_data_mat,na.rm=T)))
  # floor the data at 0
  curr_data_mat[curr_data_mat<0] = 0
  # Keep a logged version of the data
  curr_data_log = log(curr_data_mat+1,base=2)
  # organize the metadata: phenotypic covariates for this dataset's vials
  curr_meta = merged_dmaqc_data[colnames(curr_data),
                                union(biospec_cols,differential_analysis_cols)]
  # remove metadata variables with too many (>=10%) NAs
  na_counts = apply(is.na(curr_meta),2,sum)
  curr_meta = curr_meta[,na_counts/nrow(curr_meta) < 0.1]
  # Look at the sample order
  curr_order = newd$sample_meta[rownames(curr_meta),"sample_order"]
  # organize the dataset: align columns with the metadata rows
  curr_data = curr_data[,rownames(curr_meta)]
  curr_data_log = curr_data_log[,rownames(curr_meta)]
  # remove zero variance rows or rows with >20% NAs
  rows_to_rem = !(apply(curr_data,1,sd,na.rm=T)>0)
  percent_na = rowSums(is.na(curr_data)) / ncol(curr_data)
  print(paste("number of NA cells:",sum(is.na(curr_data))))
  rows_to_rem = rows_to_rem | percent_na > 0.2
  # remove rows with no annotation
  rows_to_rem = rows_to_rem | newd$data_raw_rownames == ""
  rows_to_rem = rows_to_rem | newd$data_raw_rownames == "-"
  rows_to_rem = rows_to_rem | newd$data_raw_rownames == "_"
  rows_to_rem = rows_to_rem | is.na(newd$data_raw_rownames)
  # remove rows with duplicated representation
  row_duplications = names(which(table(newd$data_raw_rownames)>1))
  if(any(newd$data_raw_rownames[!rows_to_rem] %in% row_duplications)){
    rows_to_rem = rows_to_rem | newd$data_raw_rownames %in% row_duplications
    print("# WARNING: dataset has duplicated row names whose data are now ignored")
    print(paste(row_duplications,collapse=","))
  }
  curr_data = curr_data[!rows_to_rem,]
  curr_data_log = curr_data_log[!rows_to_rem,]
  # impute (knn) - use capture.output to avoid prints
  capture.output(
    {curr_data_imp = impute::impute.knn(as.matrix(curr_data_log))$data}
    ,file=NULL)
  curr_data_rnames = newd$data_raw_rownames[!rows_to_rem]
  rownames(curr_data_imp) = curr_data_rnames
  # Regress out the sample order
  # lm_regress_matrix works on columns, hence the transposes
  curr_data_imp2 = t(lm_regress_out_matrix(t(curr_data_imp),curr_order))
  # update the data object: keep both raw and processed versions
  metabolomics_processed_datasets[[currname]] = newd
  metabolomics_processed_datasets[[currname]]$sample_data = curr_data_imp
  metabolomics_processed_datasets[[currname]]$sample_data_nolog_noimp = curr_data
  metabolomics_processed_datasets[[currname]]$data_raw_rownames = curr_data_rnames
  metabolomics_processed_datasets[[currname]]$sample_meta_parsed = curr_meta
  metabolomics_processed_datasets[[currname]]$sample_data_order_adj = curr_data_imp2
}
# Persist the processed datasets so the (slow) preprocessing can be skipped
save_to_bucket(metabolomics_processed_datasets,
file="metabolomics_processed_datasets10282019.RData",
bucket = "gs://bic_data_analysis/pass1a/metabolomics/")
Alternatively load the result from the bucket to save time:
# Load the previously saved processed datasets (first element of the list)
metabolomics_processed_datasets = load_from_bucket(
file="metabolomics_processed_datasets10282019.RData",
bucket = "gs://bic_data_analysis/pass1a/metabolomics/"
)[[1]]
Untargeted data are typically log-transformed and analyzed using linear models. On the other hand, concentration data are sometimes analyzed with the same type of models but using the original data. This raises a problem if we wish to compare exact statistics from these data. In this section we perform residual analysis for single metabolites. Our goal is to identify if concentration data behaves “normally” when not log-transformed. The analysis below examines the residuals of the data after fitting linear models for each metabolite, adjusting for freeze time and sex. We then compare the results with and without the log-transformation, counting the number of metabolites with a significant evidence for non-normally distributed residuals.
# Normality check via the Kolmogorov-Smirnov test: compare v against a normal
# distribution parameterized by v's own (NA-stripped) mean and sd, and return
# the KS p-value.
# NOTE(review): `samp` is currently unused (presumably intended for
# subsampling large vectors) — kept for interface compatibility.
is_normal_test <- function(v, samp = 10000) {
  ks.test(v, "pnorm", mean(v, na.rm = TRUE), sd(v, na.rm = TRUE))$p.value
}
# go over the named (targeted) datasets, get a logged and an unlogged version
# of the data, use these as inputs for the regression
residual_analysis_results = list()
for(nn1 in names(metabolomics_processed_datasets)){
if(grepl("untargeted",nn1)){next}
# sample_data is on log2 scale (see preprocessing above); 2^x restores
# the original scale
x_log = metabolomics_processed_datasets[[nn1]]$sample_data
x_unlog = 2^x_log
# take the covariates, ignore distances
# (biospec_cols[2] is the freeze-time column)
x_meta = unique(metabolomics_processed_datasets[[nn1]]$sample_meta_parsed)
curr_covs = x_meta[,intersect(colnames(x_meta),biospec_cols[2])]
curr_covs = data.frame(curr_covs,
sex=x_meta$animal.registration.sex)
# get the lm objects
curr_models = list()
for(tp in unique(x_meta$animal.key.timepoint)){
# fit the per-metabolite differential-abundance model (project helper),
# once on the logged and once on the unlogged data
res_log = apply(
x_log,1,
pass1a_simple_differential_abundance,
tps = x_meta$animal.key.timepoint,tp=tp,
is_control = x_meta$animal.key.is_control,
covs = curr_covs,return_model=T
)
res_unlog = apply(
x_unlog,1,
pass1a_simple_differential_abundance,
tps = x_meta$animal.key.timepoint,tp=tp,
is_control = x_meta$animal.key.is_control,
covs = curr_covs,return_model=T
)
# KS normality p-value of the model residuals, per metabolite
is_norm = cbind(
sapply(res_log,function(x)is_normal_test(residuals(x))),
sapply(res_unlog,function(x)is_normal_test(residuals(x)))
)
colnames(is_norm) = c("log","not log")
curr_models[[as.character(tp)]] = is_norm
}
residual_analysis_results[[nn1]] = curr_models
}
# Is there a significant difference between the two options?
# Paired one-sided Wilcoxon: are the log-model residual p-values greater
# (i.e., more consistent with normality) than the unlogged ones?
log_vs_unlog_summ_mat = sapply(residual_analysis_results,
function(x)sapply(x,
function(y)
wilcox.test(y[,1],y[,2],paired = T,alternative = "g")$p.value))
# Count the number of non-normal metabolites (KS p < 0.05)
num_nonnormal_log = sapply(residual_analysis_results,
function(x)sapply(x,
function(y)sum(y[,1]<0.05)))
num_nonnormal_log =
num_nonnormal_log[,order(colnames(num_nonnormal_log))]
num_nonnormal_unlog = sapply(residual_analysis_results,
function(x)sapply(x,
function(y)sum(y[,2]<0.05)))
num_nonnormal_unlog =
num_nonnormal_unlog[,order(colnames(num_nonnormal_unlog))]
library(corrplot)
par(mar = c(5,5,5,10))
# positive cells: more non-normal metabolites under the log transform
corrplot(t(num_nonnormal_log)- t(num_nonnormal_unlog),
is.corr = F,tl.cex = 0.7)
Compare overlaps, effect sizes, and correlations within tissues. Compare targeted-untargeted pairs only. For differential analysis we use the same model as in the analysis above.
# helper function to transform a metabolomics matrix
# to that of its motrpac compound ids
# x           - numeric matrix; rownames are platform-specific metabolite ids
# row_annot_x - row annotation data.frame; one of its columns holds the ids
#               used as rownames(x) and $motrpac_comp_name holds the target
#               compound names
# Returns x restricted to annotated rows, rownames replaced by the motrpac
# compound names.
extract_metab_data_from_row_annot<-function(x,row_annot_x){
  # get the column whose values best overlap the row names of x
  int_sizes = apply(row_annot_x,2,function(x,y)length(intersect(x,y)),y=rownames(x))
  ind = which(int_sizes==max(int_sizes,na.rm = T))[1]
  # keep only annotation rows whose id appears in x
  # (bug fix: the original called is.element(..., set = rownames(x));
  # is.element()'s second argument is named `table`, so `set =` raises an
  # unused-argument error)
  row_annot_x = row_annot_x[row_annot_x[,ind] %in% rownames(x),]
  rownames(row_annot_x) = row_annot_x[,ind]
  shared = intersect(rownames(row_annot_x),rownames(x))
  x = x[shared,]
  row_annot_x = row_annot_x[shared,]
  rownames(x) = row_annot_x$motrpac_comp_name
  return(x)
}
single_metabolite_corrs = list()
single_metabolite_de = c()
named2covered_shared_metabolites = list()
# For every targeted (named) dataset nn1, compare against every untargeted
# dataset nn2 of the same tissue: correlate the shared metabolites and run
# the same per-metabolite differential analysis on both.
for(nn1 in names(metabolomics_processed_datasets)){
nn1_tissue = strsplit(nn1,split=",")[[1]][1]
nn1_tissue = gsub("_powder","",nn1_tissue)
if(grepl("untargeted",nn1)){next}
single_metabolite_corrs[[nn1]] = list()
named2covered_shared_metabolites[[nn1]] = NULL
for(nn2 in names(metabolomics_processed_datasets)){
if(nn2 == nn1){next}
if(!grepl("untargeted",nn2)){next}
nn2_tissue = strsplit(nn2,split=",")[[1]][1]
nn2_tissue = gsub("_powder","",nn2_tissue)
nn2_dataset = strsplit(nn2,split=",")[[1]][2]
# compare within the same tissue only
if(nn1_tissue!=nn2_tissue){next}
print(nn2)
# get the numeric datasets and their annotation
x = metabolomics_processed_datasets[[nn1]]$sample_data
y = metabolomics_processed_datasets[[nn2]]$sample_data
row_annot_x = metabolomics_processed_datasets[[nn1]]$row_annot
row_annot_y = metabolomics_processed_datasets[[nn2]]$row_annot
# transform metabolite names to the motrpac comp name
x = extract_metab_data_from_row_annot(x,row_annot_x)
y = extract_metab_data_from_row_annot(y,row_annot_y)
# align the sample sets: map vial labels to animal BIDs
bid_y = merged_dmaqc_data[colnames(y),"bid"]
bid_x = merged_dmaqc_data[colnames(x),"bid"]
# step 1: merge samples from the same BID
if(length(unique(bid_x))!=length(bid_x)){
x = aggregate_repeated_samples(x,bid_x)
}
else{
colnames(x) = bid_x
}
if(length(unique(bid_y))!=length(bid_y)){
y = aggregate_repeated_samples(y,bid_y)
}else{
colnames(y) = bid_y
}
# step 2: use the shared bio ids
shared_bids = as.character(intersect(colnames(y),colnames(x)))
x = as.matrix(x[,shared_bids])
y = as.matrix(y[,shared_bids])
# At this point x and y are over the same BIDs, now we add the metadata
# NOTE(review): the metadata comes from nn1 (the targeted dataset) although
# the variable is called y_meta — presumably fine because both datasets now
# cover the same BIDs; confirm this is intentional
y_meta = unique(metabolomics_processed_datasets[[nn1]]$sample_meta_parsed)
rownames(y_meta) = y_meta$bid
y_meta = y_meta[shared_bids,]
# get the shared metabolites
shared_metabolites = intersect(rownames(x),rownames(y))
shared_metabolites = na.omit(shared_metabolites)
if(length(shared_metabolites)==0){next}
named2covered_shared_metabolites[[nn1]] = union(
named2covered_shared_metabolites[[nn1]],
shared_metabolites
)
# Compute the correlation matrices of the shared metabolites
# (single shared metabolite needs the scalar form of cor)
if(length(shared_metabolites)>1){
corrs =cor(t(x[shared_metabolites,]),
t(y[shared_metabolites,]),method = "spearman")
}
else{
corrs = cor(x[shared_metabolites,],
y[shared_metabolites,],method = "spearman")
}
# take the covariates (ignore distances; biospec_cols[2] is freeze time)
curr_cov_cols = intersect(colnames(y_meta),biospec_cols[2])
curr_covs = data.frame(y_meta[,curr_cov_cols])
names(curr_covs) = curr_cov_cols
curr_covs$sex = y_meta$animal.registration.sex # add sex
# differential analysis, one model per time point
for(tp in unique(y_meta$animal.key.timepoint)){
# matrix(...) keeps a one-row matrix when only one metabolite is shared
resx = t(apply(
matrix(x[shared_metabolites,],nrow=length(shared_metabolites)),1,
pass1a_simple_differential_abundance,
tps = y_meta$animal.key.timepoint,tp=tp,
is_control = y_meta$animal.key.is_control,
covs = curr_covs,return_model=F
))
resy = t(apply(
matrix(y[shared_metabolites,],nrow=length(shared_metabolites)),1,
pass1a_simple_differential_abundance,
tps = y_meta$animal.key.timepoint,tp=tp,
is_control = y_meta$animal.key.is_control,
covs = curr_covs,return_model=F
))
# Add dataset information, time point, tissue
# These are important annotations for our summary matrix
# called single_metabolite_de below
added_columns = matrix(cbind(
rep(nn1,length(shared_metabolites)),
rep(nn2,length(shared_metabolites)),
shared_metabolites,
rep(tp,length(shared_metabolites)),
rep(nn1_tissue,length(shared_metabolites))
),nrow=length(shared_metabolites))
# append an is_targeted flag (1 for the targeted dataset, 0 otherwise)
resx = cbind(resx,rep(T,nrow(resx)))
colnames(resx)[ncol(resx)] = "is_targeted"
resy = cbind(resy,rep(F,nrow(resy)))
colnames(resy)[ncol(resy)] = "is_targeted"
# drop the other dataset's name column so both tables share one schema
if(nrow(resx)>1){
resx = cbind(added_columns[,-2],resx)
resy = cbind(added_columns[,-1],resy)
}
else{
resx = c(added_columns[,-2],resx)
resy = c(added_columns[,-1],resy)
}
single_metabolite_de = rbind(single_metabolite_de,resx)
single_metabolite_de = rbind(single_metabolite_de,resy)
}
single_metabolite_corrs[[nn1]][[nn2]] = corrs
}
}
## [1] "plasma,metab_u_hilicpos,untargeted"
## [1] "plasma,metab_u_ionpneg,untargeted"
## [1] "plasma,metab_u_lrpneg,untargeted"
## [1] "plasma,metab_u_lrppos,untargeted"
## [1] "plasma,metab_u_rpneg,untargeted"
## [1] "plasma,metab_u_rppos,untargeted"
## [1] "plasma,metab_u_hilicpos,untargeted"
## [1] "plasma,metab_u_ionpneg,untargeted"
## [1] "plasma,metab_u_lrpneg,untargeted"
## [1] "plasma,metab_u_lrppos,untargeted"
## [1] "plasma,metab_u_rpneg,untargeted"
## [1] "plasma,metab_u_rppos,untargeted"
## [1] "plasma,metab_u_hilicpos,untargeted"
## [1] "plasma,metab_u_ionpneg,untargeted"
## [1] "plasma,metab_u_lrpneg,untargeted"
## [1] "plasma,metab_u_lrppos,untargeted"
## [1] "plasma,metab_u_rpneg,untargeted"
## [1] "plasma,metab_u_rppos,untargeted"
## [1] "plasma,metab_u_hilicpos,untargeted"
## [1] "plasma,metab_u_ionpneg,untargeted"
## [1] "plasma,metab_u_lrpneg,untargeted"
## [1] "plasma,metab_u_lrppos,untargeted"
## [1] "plasma,metab_u_rpneg,untargeted"
## [1] "plasma,metab_u_rppos,untargeted"
## [1] "plasma,metab_u_hilicpos,untargeted"
## [1] "plasma,metab_u_ionpneg,untargeted"
## [1] "plasma,metab_u_lrpneg,untargeted"
## [1] "plasma,metab_u_lrppos,untargeted"
## [1] "plasma,metab_u_rpneg,untargeted"
## [1] "plasma,metab_u_rppos,untargeted"
## [1] "gastrocnemius_powder,metab_u_hilicpos,untargeted"
## [1] "gastrocnemius_powder,metab_u_ionpneg,untargeted"
## [1] "gastrocnemius_powder,metab_u_lrpneg,untargeted"
## [1] "gastrocnemius_powder,metab_u_lrppos,untargeted"
## [1] "gastrocnemius_powder,metab_u_rpneg,untargeted"
## [1] "gastrocnemius_powder,metab_u_rppos,untargeted"
## [1] "gastrocnemius_powder,metab_u_hilicpos,untargeted"
## [1] "gastrocnemius_powder,metab_u_ionpneg,untargeted"
## [1] "gastrocnemius_powder,metab_u_lrpneg,untargeted"
## [1] "gastrocnemius_powder,metab_u_lrppos,untargeted"
## [1] "gastrocnemius_powder,metab_u_rpneg,untargeted"
## [1] "gastrocnemius_powder,metab_u_rppos,untargeted"
## [1] "gastrocnemius_powder,metab_u_hilicpos,untargeted"
## [1] "gastrocnemius_powder,metab_u_ionpneg,untargeted"
## [1] "gastrocnemius_powder,metab_u_lrpneg,untargeted"
## [1] "gastrocnemius_powder,metab_u_lrppos,untargeted"
## [1] "gastrocnemius_powder,metab_u_rpneg,untargeted"
## [1] "gastrocnemius_powder,metab_u_rppos,untargeted"
## [1] "gastrocnemius_powder,metab_u_hilicpos,untargeted"
## [1] "gastrocnemius_powder,metab_u_ionpneg,untargeted"
## [1] "gastrocnemius_powder,metab_u_lrpneg,untargeted"
## [1] "gastrocnemius_powder,metab_u_lrppos,untargeted"
## [1] "gastrocnemius_powder,metab_u_rpneg,untargeted"
## [1] "gastrocnemius_powder,metab_u_rppos,untargeted"
## [1] "gastrocnemius_powder,metab_u_hilicpos,untargeted"
## [1] "gastrocnemius_powder,metab_u_ionpneg,untargeted"
## [1] "gastrocnemius_powder,metab_u_lrpneg,untargeted"
## [1] "gastrocnemius_powder,metab_u_lrppos,untargeted"
## [1] "gastrocnemius_powder,metab_u_rpneg,untargeted"
## [1] "gastrocnemius_powder,metab_u_rppos,untargeted"
## [1] "gastrocnemius_powder,metab_u_hilicpos,untargeted"
## [1] "gastrocnemius_powder,metab_u_ionpneg,untargeted"
## [1] "gastrocnemius_powder,metab_u_lrpneg,untargeted"
## [1] "gastrocnemius_powder,metab_u_lrppos,untargeted"
## [1] "gastrocnemius_powder,metab_u_rpneg,untargeted"
## [1] "gastrocnemius_powder,metab_u_rppos,untargeted"
## [1] "gastrocnemius_powder,metab_u_hilicpos,untargeted"
## [1] "gastrocnemius_powder,metab_u_ionpneg,untargeted"
## [1] "gastrocnemius_powder,metab_u_lrpneg,untargeted"
## [1] "gastrocnemius_powder,metab_u_lrppos,untargeted"
## [1] "gastrocnemius_powder,metab_u_rpneg,untargeted"
## [1] "gastrocnemius_powder,metab_u_rppos,untargeted"
## [1] "gastrocnemius_powder,metab_u_hilicpos,untargeted"
## [1] "gastrocnemius_powder,metab_u_ionpneg,untargeted"
## [1] "gastrocnemius_powder,metab_u_lrpneg,untargeted"
## [1] "gastrocnemius_powder,metab_u_lrppos,untargeted"
## [1] "gastrocnemius_powder,metab_u_rpneg,untargeted"
## [1] "gastrocnemius_powder,metab_u_rppos,untargeted"
## [1] "gastrocnemius_powder,metab_u_hilicpos,untargeted"
## [1] "gastrocnemius_powder,metab_u_ionpneg,untargeted"
## [1] "gastrocnemius_powder,metab_u_lrpneg,untargeted"
## [1] "gastrocnemius_powder,metab_u_lrppos,untargeted"
## [1] "gastrocnemius_powder,metab_u_rpneg,untargeted"
## [1] "gastrocnemius_powder,metab_u_rppos,untargeted"
## [1] "liver_powder,metab_u_hilicpos,untargeted"
## [1] "liver_powder,metab_u_ionpneg,untargeted"
## [1] "liver_powder,metab_u_lrpneg,untargeted"
## [1] "liver_powder,metab_u_lrppos,untargeted"
## [1] "liver_powder,metab_u_rpneg,untargeted"
## [1] "liver_powder,metab_u_rppos,untargeted"
## [1] "liver_powder,metab_u_hilicpos,untargeted"
## [1] "liver_powder,metab_u_ionpneg,untargeted"
## [1] "liver_powder,metab_u_lrpneg,untargeted"
## [1] "liver_powder,metab_u_lrppos,untargeted"
## [1] "liver_powder,metab_u_rpneg,untargeted"
## [1] "liver_powder,metab_u_rppos,untargeted"
## [1] "liver_powder,metab_u_hilicpos,untargeted"
## [1] "liver_powder,metab_u_ionpneg,untargeted"
## [1] "liver_powder,metab_u_lrpneg,untargeted"
## [1] "liver_powder,metab_u_lrppos,untargeted"
## [1] "liver_powder,metab_u_rpneg,untargeted"
## [1] "liver_powder,metab_u_rppos,untargeted"
## [1] "liver_powder,metab_u_hilicpos,untargeted"
## [1] "liver_powder,metab_u_ionpneg,untargeted"
## [1] "liver_powder,metab_u_lrpneg,untargeted"
## [1] "liver_powder,metab_u_lrppos,untargeted"
## [1] "liver_powder,metab_u_rpneg,untargeted"
## [1] "liver_powder,metab_u_rppos,untargeted"
## [1] "liver_powder,metab_u_hilicpos,untargeted"
## [1] "liver_powder,metab_u_ionpneg,untargeted"
## [1] "liver_powder,metab_u_lrpneg,untargeted"
## [1] "liver_powder,metab_u_lrppos,untargeted"
## [1] "liver_powder,metab_u_rpneg,untargeted"
## [1] "liver_powder,metab_u_rppos,untargeted"
## [1] "liver_powder,metab_u_hilicpos,untargeted"
## [1] "liver_powder,metab_u_ionpneg,untargeted"
## [1] "liver_powder,metab_u_lrpneg,untargeted"
## [1] "liver_powder,metab_u_lrppos,untargeted"
## [1] "liver_powder,metab_u_rpneg,untargeted"
## [1] "liver_powder,metab_u_rppos,untargeted"
## [1] "liver_powder,metab_u_hilicpos,untargeted"
## [1] "liver_powder,metab_u_ionpneg,untargeted"
## [1] "liver_powder,metab_u_lrpneg,untargeted"
## [1] "liver_powder,metab_u_lrppos,untargeted"
## [1] "liver_powder,metab_u_rpneg,untargeted"
## [1] "liver_powder,metab_u_rppos,untargeted"
## [1] "liver_powder,metab_u_hilicpos,untargeted"
## [1] "liver_powder,metab_u_ionpneg,untargeted"
## [1] "liver_powder,metab_u_lrpneg,untargeted"
## [1] "liver_powder,metab_u_lrppos,untargeted"
## [1] "liver_powder,metab_u_rpneg,untargeted"
## [1] "liver_powder,metab_u_rppos,untargeted"
## [1] "white_adipose_powder,metab_u_hilicpos,untargeted"
## [1] "white_adipose_powder,metab_u_lrpneg,untargeted"
## [1] "white_adipose_powder,metab_u_lrppos,untargeted"
## [1] "white_adipose_powder,metab_u_rpneg,untargeted"
## [1] "white_adipose_powder,metab_u_rppos,untargeted"
## [1] "white_adipose_powder,metab_u_hilicpos,untargeted"
## [1] "white_adipose_powder,metab_u_lrpneg,untargeted"
## [1] "white_adipose_powder,metab_u_lrppos,untargeted"
## [1] "white_adipose_powder,metab_u_rpneg,untargeted"
## [1] "white_adipose_powder,metab_u_rppos,untargeted"
## [1] "white_adipose_powder,metab_u_hilicpos,untargeted"
## [1] "white_adipose_powder,metab_u_lrpneg,untargeted"
## [1] "white_adipose_powder,metab_u_lrppos,untargeted"
## [1] "white_adipose_powder,metab_u_rpneg,untargeted"
## [1] "white_adipose_powder,metab_u_rppos,untargeted"
## [1] "white_adipose_powder,metab_u_hilicpos,untargeted"
## [1] "white_adipose_powder,metab_u_lrpneg,untargeted"
## [1] "white_adipose_powder,metab_u_lrppos,untargeted"
## [1] "white_adipose_powder,metab_u_rpneg,untargeted"
## [1] "white_adipose_powder,metab_u_rppos,untargeted"
## [1] "white_adipose_powder,metab_u_hilicpos,untargeted"
## [1] "white_adipose_powder,metab_u_lrpneg,untargeted"
## [1] "white_adipose_powder,metab_u_lrppos,untargeted"
## [1] "white_adipose_powder,metab_u_rpneg,untargeted"
## [1] "white_adipose_powder,metab_u_rppos,untargeted"
## [1] "white_adipose_powder,metab_u_hilicpos,untargeted"
## [1] "white_adipose_powder,metab_u_lrpneg,untargeted"
## [1] "white_adipose_powder,metab_u_lrppos,untargeted"
## [1] "white_adipose_powder,metab_u_rpneg,untargeted"
## [1] "white_adipose_powder,metab_u_rppos,untargeted"
## [1] "white_adipose_powder,metab_u_hilicpos,untargeted"
## [1] "white_adipose_powder,metab_u_lrpneg,untargeted"
## [1] "white_adipose_powder,metab_u_lrppos,untargeted"
## [1] "white_adipose_powder,metab_u_rpneg,untargeted"
## [1] "white_adipose_powder,metab_u_rppos,untargeted"
## [1] "white_adipose_powder,metab_u_hilicpos,untargeted"
## [1] "white_adipose_powder,metab_u_lrpneg,untargeted"
## [1] "white_adipose_powder,metab_u_lrppos,untargeted"
## [1] "white_adipose_powder,metab_u_rpneg,untargeted"
## [1] "white_adipose_powder,metab_u_rppos,untargeted"
## [1] "white_adipose_powder,metab_u_hilicpos,untargeted"
## [1] "white_adipose_powder,metab_u_lrpneg,untargeted"
## [1] "white_adipose_powder,metab_u_lrppos,untargeted"
## [1] "white_adipose_powder,metab_u_rpneg,untargeted"
## [1] "white_adipose_powder,metab_u_rppos,untargeted"
# Reformat the results for easier comparison later
single_metabolite_de = data.frame(single_metabolite_de)
# columns 1-4 are annotations, the rest are model statistics
names(single_metabolite_de) = c(
"dataset","metabolite","tp","tissue",
"Est","Std","Tstat","Pvalue","is_targeted")
# statistics were accumulated through a character matrix; restore numerics
for(col in names(single_metabolite_de)[-c(1:4)]){
single_metabolite_de[[col]] = as.numeric(
as.character(single_metabolite_de[[col]]))
}
for(col in names(single_metabolite_de)[1:4]){
single_metabolite_de[[col]] =
as.character(single_metabolite_de[[col]])
}
# Remove duplications
# (round first so numerically-identical rows collapse under unique();
# p-values keep more digits than the other statistics)
rownames(single_metabolite_de) = NULL
for(nn in names(single_metabolite_de)){
ndig = 5
if(grepl("pval",nn,ignore.case = T)){
ndig = 10
}
if(is.numeric(single_metabolite_de[[nn]])){
single_metabolite_de[[nn]] = round(single_metabolite_de[[nn]],digits = ndig)
}
}
single_metabolite_de = unique(single_metabolite_de)
We next transform the data above into tables that contain data for each combination of metabolite, time point, and tissue. These are then used for different meta-analyses: (1) a simple random effects analysis, (2) random effects with a binary covariate indicating if a dataset is targeted or untargeted, (3) redo the RE model of (1) with the targeted data only, and (4) redo the RE model of (1) with the untargeted data only.
library(metafor)
# For each (tissue, timepoint, metabolite) combination run random-effects
# meta-analyses over the available datasets:
#  re_model1     - simple RE model
#  re_model2     - RE model with an is_targeted moderator
#  re_model_tar  - RE model over the targeted datasets only
#  re_model_untar- RE model over the untargeted datasets only
meta_analysis_stats = list()
for(tissue in unique(single_metabolite_de$tissue)){
for(tp in unique(single_metabolite_de$tp)){
curr_subset = single_metabolite_de[
single_metabolite_de$tissue==tissue &
single_metabolite_de$tp==tp,]
for(metabolite in unique(curr_subset$metabolite)){
curr_met_data = curr_subset[
curr_subset$metabolite==metabolite,]
# sampling variance = squared standard error
curr_met_data$var = curr_met_data$Std^2
re_model1 = NULL;re_model2=NULL
re_model_tar = NULL;re_model_untar = NULL
# try(): rma may fail to converge for some metabolites (error shown below)
try({re_model1 = rma.uni(curr_met_data$Est,curr_met_data$var)})
try({re_model2 = rma.mv(curr_met_data$Est,curr_met_data$var,
mods=curr_met_data$is_targeted)})
try({re_model_tar = rma.uni(
curr_met_data[curr_met_data$is_targeted==1,"Est"],
curr_met_data[curr_met_data$is_targeted==1,"var"]
)})
try({re_model_untar = rma.uni(
curr_met_data[curr_met_data$is_targeted==0,"Est"],
curr_met_data[curr_met_data$is_targeted==0,"var"]
)})
# failed fits remain NULL in the stored list
meta_analysis_stats[[paste(metabolite,tissue,tp,sep=",")]] =
list(curr_met_data=curr_met_data,re_model1=re_model1,
re_model2 = re_model2,re_model_tar=re_model_tar,
re_model_untar = re_model_untar)
}
}
}
## Error in rma.uni(curr_met_data$Est, curr_met_data$var) :
## Fisher scoring algorithm did not converge. See 'help(rma)' for possible remedies.
We now show some plots to summarize the comparison.
We first plot the number and percentage of metabolites in the targeted datasets that are measured in at least one untargeted dataset.
library(ggplot2)
# number of metabolites (rows) in each processed dataset
dataset2num_metabolites = sapply(metabolomics_processed_datasets,
function(x)nrow(x$sample_data))
# per targeted dataset: how many of its metabolites were seen in at least one
# untargeted dataset (collected in the comparison loop above)
named_dataset_coverage = sapply(named2covered_shared_metabolites,length)
named_dataset_coverage = data.frame(
name = names(named_dataset_coverage),
percentage = named_dataset_coverage /
dataset2num_metabolites[names(named_dataset_coverage)],
count = named_dataset_coverage,
total = dataset2num_metabolites[names(named_dataset_coverage)]
)
# add datasets with no coverage
all_targeted_datasets = names(metabolomics_processed_datasets)
all_targeted_datasets = all_targeted_datasets[!grepl("untarg",all_targeted_datasets)]
zero_coverage_datasets = setdiff(all_targeted_datasets,
named_dataset_coverage$name)
zero_coverage_datasets = data.frame(
name = zero_coverage_datasets,
percentage = rep(0,length(zero_coverage_datasets)),
count = rep(0,length(zero_coverage_datasets)),
total = dataset2num_metabolites[zero_coverage_datasets]
)
named_dataset_coverage = rbind(named_dataset_coverage,
zero_coverage_datasets)
named_dataset_coverage =
named_dataset_coverage[order(as.character(named_dataset_coverage$name)),]
# horizontal bar chart of coverage percentages, labeled by absolute counts
print(ggplot(named_dataset_coverage, aes(x=name, y=percentage)) +
geom_bar(stat = "identity",width=0.2) + coord_flip() +
geom_text(data=named_dataset_coverage,
aes(name, percentage+0.05, label=count),
position = position_dodge(width=0.9),
size=4) +
ggtitle("Targeted dataset: coverage by untargeted"))
We examine the average absolute correlation between the platforms (within tissues). Whenever two platforms share more than a single metabolite, we plot both the average correlation between the same metabolites and the average correlation between different metabolites. Including the cross-metabolite correlations is important because they provide a baseline: they show how large a same-metabolite correlation needs to be before it can be considered meaningful. Indeed, in many cases below, even this baseline correlation is higher than one might expect.
# Next examine the Spearman correlations between platforms
# Mean of the absolute values; extra args (e.g. na.rm) go to mean()
mean_abs <- function(x, ...) mean(abs(x), ...)
# Standard deviation of the absolute values; extra args go to sd()
sd_abs <- function(x, ...) sd(abs(x), ...)
# Summarize a cross-platform correlation matrix into two statistics:
#   same  - func applied to the diagonal (same-metabolite pairs)
#   other - func applied to all off-diagonal cells (different-metabolite
#           pairs), giving the baseline correlation level
# BUGFIX: the original used only the lower triangle for "other". These
# cross-platform matrices are not symmetric (corrs[i,j] correlates
# metabolite i on one platform with metabolite j on the other), so the
# upper triangle holds distinct pairs; all off-diagonal cells are used.
# For a single shared metabolite (1x1 matrix) "other" is NA.
extract_diag_vs_non_diag<-function(corrs,func=mean_abs,...){
if(length(corrs)==1){
return(c(same=func(corrs,...),other=NA))
}
same = func(diag(corrs),...)
other = func(
c(corrs[row(corrs)!=col(corrs)]),...)
return(c(same=same,other=other))
}
# For each targeted dataset, plot the mean absolute Spearman correlation
# of same-metabolite pairs vs. different-metabolite pairs against every
# untargeted dataset it shares metabolites with.
for(tar_dataset in names(single_metabolite_corrs)){
l = single_metabolite_corrs[[tar_dataset]]
if(length(l)==0){next}
# one row per untargeted comparison; columns: same / other mean |rho|
corr_info = as.data.frame(t(sapply(l, extract_diag_vs_non_diag)))
# matching SDs for the error bars
corr_sd = as.data.frame(t(sapply(l, extract_diag_vs_non_diag,func=sd_abs)))
# keep only the untargeted dataset name (second field of the "a,b" key)
rownames(corr_info) = sapply(rownames(corr_info),
function(x)strsplit(x,split=",")[[1]][2])
rownames(corr_info) = gsub("metab_u_","",rownames(corr_info))
rownames(corr_sd) = rownames(corr_info)
corr_info$dataset = rownames(corr_info)
corr_sd$dataset = corr_info$dataset
# long format for ggplot; melt presumably from reshape2 loaded
# upstream - verify
corr_info = melt(corr_info)
corr_sd = melt(corr_sd)
corr_info$sd = corr_sd$value
# grouped bars (same vs. other pairs) with +/- 1 SD error bars
print(
ggplot(corr_info, aes(x=dataset, y=value, fill=variable)) +
geom_bar(position=position_dodge(), stat="identity", colour='black') +
geom_errorbar(aes(ymin=value-sd, ymax=value+sd),na.rm=T,
width=.2,position=position_dodge(.9)) +
ggtitle(tar_dataset) + xlab("Untargeted dataset") + ylab("Spearman") +
labs(fill = "Pair type") +
theme(legend.position="top",legend.direction = "horizontal")
)
}
Here we go into the single metabolites comparison in greater detail (again, within tissues).
We start with a few examples. Here are the results for lactate in plasma.
# Lactate in plasma: all meta-analysis results whose key
# ("metabolite,tissue,timepoint") matches both "lact" and "plasma"
lact_res = meta_analysis_stats[
  grepl("lact",names(meta_analysis_stats),ignore.case = T) &
  grepl("plasma",names(meta_analysis_stats),ignore.case = T)
]
# order the results by time point (third field of the key)
lact_res_hours = sapply(names(lact_res),
  function(x)as.numeric(strsplit(x,split=",")[[1]][3]))
lact_res = lact_res[order(lact_res_hours)]
# Forest plot of the simple RE model for the first (up to) six time
# points. head() avoids NA names when fewer than six results exist
# (the original [1:6] would crash), and entries whose RE model failed
# to converge (re_model1 is NULL) are skipped instead of crashing forest().
for(lact_example in head(names(lact_res),6)){
  curr_model = meta_analysis_stats[[lact_example]]$re_model1
  if(is.null(curr_model)){next}
  forest(curr_model,
         slab = meta_analysis_stats[[lact_example]][[1]][,1],
         main = lact_example,xlab = "Log fc",
         col = "blue",cex = 1.1)
}
We can now check the same analysis for liver:
# Same analysis as above, but for lactate in liver
lact_res = meta_analysis_stats[
  grepl("lact",names(meta_analysis_stats),ignore.case = T) &
  grepl("liver",names(meta_analysis_stats),ignore.case = T)
]
# order the results by time point (third field of the key)
lact_res_hours = sapply(names(lact_res),
  function(x)as.numeric(strsplit(x,split=",")[[1]][3]))
lact_res = lact_res[order(lact_res_hours)]
# Forest plots for the first (up to) six time points; head() avoids NA
# names when fewer than six results exist, and non-converged models
# (NULL) are skipped instead of crashing forest().
for(lact_example in head(names(lact_res),6)){
  curr_model = meta_analysis_stats[[lact_example]]$re_model1
  if(is.null(curr_model)){next}
  forest(curr_model,
         slab = meta_analysis_stats[[lact_example]][[1]][,1],
         main = lact_example,xlab = "Log fc",
         col = "blue",cex = 1.1)
}
We now move to a systematic comparison of the datasets. We start with a naive comparison, checking if metabolites are consistently being discovered using a simple \(p=0.001\) threshold for significance.
# Naive comparison
# For each meta-analysis, cross-tabulate significance (Pvalue < thr)
# against dataset type (is_targeted: 0 = untargeted, 1 = targeted)
thr = 0.001
naive_analysis_tables = lapply(meta_analysis_stats,
function(x)table(x$curr_met_data[,"Pvalue"]<thr,
x$curr_met_data[,"is_targeted"]))
# keep tables with a TRUE row, i.e. at least one significant result
table_with_sig_results = naive_analysis_tables[
sapply(naive_analysis_tables,nrow)>1
]
percent_with_sig =
length(table_with_sig_results)/length(naive_analysis_tables)
# "untargeted_sig,targeted_sig" flag string per metabolite
# NOTE(review): assumes every table has both a "0" and a "1" column,
# i.e. each metabolite appears in both dataset types - verify
sig_results = sapply(table_with_sig_results,
function(x)paste(x["TRUE",c("0","1")]>0,collapse=","))
sig_results_counts = table(sig_results)
print("Counting the number of metabolites with p<0.001")
## [1] "Counting the number of metabolites with p<0.001"
print(paste("Significant in targeted only:",sig_results_counts["FALSE,TRUE"]))
## [1] "Significant in targeted only: 66"
print(paste("Significant in untargeted only:",sig_results_counts["TRUE,FALSE"]))
## [1] "Significant in untargeted only: 147"
print(paste("Significant in both:",sig_results_counts["TRUE,TRUE"]))
## [1] "Significant in both: 90"
As a more rigorous analysis, we use the meta-analyses to obtain useful statistics for comparison.
# Extract some useful statistics per analysis
# P-value for the difference between targeted and untargeted
# (second coefficient of re_model2 = the is_targeted moderator)
targeted_diff_p =
sapply(meta_analysis_stats,function(x)x$re_model2$pval[2])
# P-values - targeted vs. untargeted
pvals_tar = sapply(meta_analysis_stats,function(x)x$re_model_tar$pval)
pvals_untar = sapply(meta_analysis_stats,function(x)x$re_model_untar$pval)
# drop entries whose untargeted model is missing (NULL -> length 0)
# NOTE(review): pvals_tar is not filtered the same way; the two vectors
# are assumed to stay aligned after this step - verify
pvals_untar = unlist(pvals_untar[sapply(pvals_untar,length)>0])
# classify each metabolite by where it reaches p < 0.001
significant_in = rep("None",length(pvals_untar))
significant_in[pvals_tar<0.001] = "Targeted"
significant_in[pvals_untar<0.001] = "Untargeted"
significant_in[pvals_tar<0.001 & pvals_untar<0.001] = "Both"
significant_diff = targeted_diff_p<0.001
df = data.frame(
targeted = -log10(pvals_tar),
untargeted = -log10(pvals_untar),
significant_in = significant_in,
significant_diff = significant_diff
)
# rank correlation between the targeted and untargeted p-values
rho = cor(pvals_tar,pvals_untar,method = "spearman")
rhop = cor.test(pvals_tar,pvals_untar,method = "spearman")$p.value
print(
ggplot(df, aes(x=targeted, y=untargeted,
shape=significant_diff, color=significant_in)) +
geom_point() +
ggtitle(paste("-log10 p-values, spearman:",format(rho,digits=2),
"(p=",format(rhop,digits=3),")"))
)
print("### Summary of differences in RE models ###")
## [1] "### Summary of differences in RE models ###"
print("Model is significant at p<0.001:")
## [1] "Model is significant at p<0.001:"
print(table(df$significant_in))
##
## Both None Targeted Untargeted
## 219 2082 117 228
print("Adding is_targeted as a covariate has p<0.001:")
## [1] "Adding is_targeted as a covariate has p<0.001:"
print(table(df$significant_diff))
##
## FALSE TRUE
## 2425 221
# Betas - targeted vs. untargeted
# Compare the pooled effect sizes (RE model intercepts) of the
# targeted-only and untargeted-only models
betas_tar = sapply(meta_analysis_stats,function(x)x$re_model_tar$beta[1,1])
betas_untar = sapply(meta_analysis_stats,function(x)x$re_model_untar$beta[1,1])
# drop entries whose untargeted model is missing (NULL -> length 0)
betas_untar = unlist(betas_untar[sapply(betas_untar,length)>0])
# reuse significant_in / significant_diff from the p-value chunk above
df = data.frame(
targeted = betas_tar,
untargeted = betas_untar,
significant_in = significant_in,
significant_diff = significant_diff
)
# rank correlation between the two sets of effect sizes
rho = cor(betas_untar,betas_tar,method = "spearman")
rhop = cor.test(betas_untar,betas_tar,method = "spearman")$p.value
print(
ggplot(df, aes(x=targeted, y=untargeted,
shape=significant_diff, color=significant_in)) +
geom_point() +
ggtitle(paste("Effect sizes, spearman:",format(rho,digits=2),
"(p=",format(rhop,digits=3),")"))
)
From the plots above we take the most extreme examples and examine their forest plots.
# Sample one random extreme example of each kind and show its forest plot.
# NOTE: the original used sample(which(cond))[1], which is unsafe when
# exactly one index matches (sample() on a length-1 integer n draws from
# 1:n); sampling the names of the matching entries avoids that footgun.
pick_random_name<-function(cond){sample(names(which(cond)),1)}
# significant in both, effect sizes agree (moderator not significant)
agree_example = pick_random_name(pvals_tar< 1e-10 & pvals_untar < 1e-10 &
  targeted_diff_p > 0.1)
forest(meta_analysis_stats[[agree_example]]$re_model1,
  slab = meta_analysis_stats[[agree_example]][[1]][,1],
  main = paste(agree_example,"significant in both, tar and untar agree",sep="\n"),
  xlab = "Log fc",col = "blue")
# significant in both, but effect sizes disagree (moderator significant)
agree_p_disagree_beta = pick_random_name(pvals_tar< 1e-10 & pvals_untar < 1e-10 &
  targeted_diff_p < 0.001)
forest(meta_analysis_stats[[agree_p_disagree_beta]]$re_model1,
  slab = meta_analysis_stats[[agree_p_disagree_beta]][[1]][,1],
  main = paste(agree_p_disagree_beta,
  "significant in both, tar and untar disagree",sep="\n"),
  xlab = "Log fc",col = "blue")
# BUGFIX: the next two titles were copy-paste errors ("significant in
# both, tar and untar agree"); they now describe the actual condition.
# significant in the targeted meta-analysis only
disagree_example1 = pick_random_name(pvals_tar< 1e-20 & pvals_untar >0.1)
forest(meta_analysis_stats[[disagree_example1]]$re_model1,
  slab = meta_analysis_stats[[disagree_example1]][[1]][,1],
  main = paste(disagree_example1,
  "significant in targeted only",sep="\n"),
  xlab = "Log fc",col = "blue")
# significant in the untargeted meta-analysis only
disagree_example2 = pick_random_name(pvals_tar > 0.1 & pvals_untar < 1e-20)
forest(meta_analysis_stats[[disagree_example2]]$re_model1,
  slab = meta_analysis_stats[[disagree_example2]][[1]][,1],
  main = paste(disagree_example2,
  "significant in untargeted only",sep="\n"),
  xlab = "Log fc",col = "blue")
Use 10-fold cross validation for analysis within tissues.
# Cross-dataset prediction within tissues, 10-fold cross validation.
# For every ordered pair (nn1 = test target, nn2 = training features)
# of datasets from the same tissue, a random forest is trained per
# metabolite of nn1 using nn2's profiles as features.
nfolds = 10
prediction_analysis_results = list()
for(nn1 in names(metabolomics_processed_datasets)){
  # tissue is the first comma-separated field of the dataset name;
  # "_powder" preparations count as the same tissue
  nn1_tissue = strsplit(nn1,split=",")[[1]][1]
  nn1_tissue = gsub("_powder","",nn1_tissue)
  for(nn2 in names(metabolomics_processed_datasets)){
    nn2_tissue = strsplit(nn2,split=",")[[1]][1]
    nn2_tissue = gsub("_powder","",nn2_tissue)
    if(nn1_tissue!=nn2_tissue){next}
    print(paste("training set:",nn2))
    print(paste("test set:",nn1))
    # get the data, merge samples by bid if necessary
    y = metabolomics_processed_datasets[[nn1]]$sample_data
    x = metabolomics_processed_datasets[[nn2]]$sample_data
    y_meta = unique(metabolomics_processed_datasets[[nn1]]$sample_meta_parsed)
    # # remove metadata variables with too many NAs
    # na_counts = apply(is.na(y_meta),2,sum)
    # y_meta = y_meta[,na_counts/nrow(y_meta) == 0]
    rownames(y_meta) = y_meta$bid
    # map vial labels (column names) to animal ids (bids)
    bid_y = merged_dmaqc_data[colnames(y),"bid"]
    bid_x = merged_dmaqc_data[colnames(x),"bid"]
    # merge samples from the same BID, otherwise just relabel columns
    if(length(unique(bid_x))!=length(bid_x)){
      x = aggregate_repeated_samples(x,bid_x)
    }else{
      colnames(x) = bid_x
    }
    if(length(unique(bid_y))!=length(bid_y)){
      y = aggregate_repeated_samples(y,bid_y)
    }else{
      colnames(y) = bid_y
    }
    # skip very large (untargeted) target datasets to bound runtime
    if(ncol(y)>1000){next}
    # regress the biospecimen covariates out - simple linear analysis
    cov_cols = intersect(colnames(y_meta),
                         setdiff(biospec_cols,"bid"))
    covs = as.matrix(y_meta[colnames(y),cov_cols])
    y = lm_regress_out_matrix(t(y),covs)
    # Prepare the input for the ML part: samples present in both datasets
    shared_bids = as.character(intersect(rownames(y),colnames(x)))
    x = t(as.matrix(x[,shared_bids]))
    y = as.matrix(y[shared_bids,])
    # Run the regressions: 10-fold CV per target metabolite
    folds = sample(rep(1:nfolds,(1+nrow(x)/nfolds)))[1:nrow(x)]
    numFeatures = min(ncol(x),2000)
    preds = c();real=c()
    for(i in 1:ncol(y)){
      # BUGFIX: was y[,1], which made every iteration predict the first
      # metabolite instead of the i-th one
      y_i = y[,i]
      i_preds = c();i_real=c()
      for(j in 1:nfolds){
        print(j)
        tr_x = x[folds!=j,]
        tr_yi = y_i[folds!=j]
        te_x = x[folds==j,]
        te_y = y_i[folds==j]
        # random forest
        # model = randomForest(tr_yi,x=tr_x,ntree = 20)
        # te_preds = predict(model,newdata = te_x)
        model = feature_selection_wrapper(tr_x,tr_yi,
                                          coeff_of_var,randomForest,
                                          topK = numFeatures,ntree=50)
        te_preds = predict(model,newdata = te_x)
        i_preds = c(i_preds,te_preds)
        i_real = c(i_real,te_y)
      }
      preds = cbind(preds,i_preds)
      real = cbind(real,i_real)
    }
    currname = paste(nn1,nn2,sep=";")
    prediction_analysis_results[[currname]] = list(
      preds = preds,real=real
    )
  }
}
names(prediction_analysis_results)
# Baseline: predict each metabolite from a few phenotypic covariates
# only (sex, weight, running distance, time point), with the same
# 10-fold CV, to compare against the cross-dataset predictions above.
cov_prediction_analysis_results = list()
for(nn1 in names(metabolomics_processed_datasets)){
  print(nn1)
  y = metabolomics_processed_datasets[[nn1]]$sample_data
  y_vials = colnames(y)
  # relabel columns from vial labels to animal ids (bids)
  bid_y = merged_dmaqc_data[colnames(y),"bid"]
  colnames(y) = bid_y
  y = t(as.matrix(y))
  # skip very large (untargeted) datasets
  if(ncol(y)>1000){next}
  cov_cols = c("animal.registration.sex",
               "acute.test.weight",
               "acute.test.distance",
               "animal.key.timepoint")
  covs = merged_dmaqc_data[y_vials,cov_cols]
  x = covs
  # Run the regressions
  # NOTE(review): assumes one vial per bid so rows of x (indexed by
  # vials) align with rows of y (indexed by bids) - verify
  folds = sample(rep(1:nfolds,(1+nrow(x)/nfolds)))[1:nrow(x)]
  numFeatures = min(ncol(x),2000)
  preds = c();real=c()
  for(i in 1:ncol(y)){
    # BUGFIX: was y[,1], which made every iteration predict the first
    # metabolite instead of the i-th one
    y_i = y[,i]
    i_preds = c();i_real=c()
    for(j in 1:nfolds){
      print(j)
      tr_x = x[folds!=j,]
      tr_yi = y_i[folds!=j]
      te_x = x[folds==j,]
      te_y = y_i[folds==j]
      # random forest on the covariates only
      model = randomForest(tr_yi,x=tr_x,ntree = 20)
      te_preds = predict(model,newdata = te_x)
      i_preds = c(i_preds,te_preds)
      i_real = c(i_real,te_y)
    }
    preds = cbind(preds,i_preds)
    real = cbind(real,i_real)
  }
  cov_prediction_analysis_results[[nn1]] = list(
    preds = preds,real=real
  )
}
# Summarize prediction performance for each "test;train" dataset pair.
results_metrics = c()
for(nn in names(prediction_analysis_results)){
  preds = prediction_analysis_results[[nn]]$preds
  real = prediction_analysis_results[[nn]]$real
  # per-metabolite Pearson/Spearman correlation between the predictions
  # and the held-out values (diagonal of the cross-correlation matrix)
  rhos1 = diag(cor(preds,real))
  rhos2 = diag(cor(preds,real,method="spearman"))
  SEs = (preds-real)^2
  mse = mean(SEs)
  # BUGFIX: SEs / apply(real,2,sd) recycled the length-p SD vector down
  # the rows of the n x p matrix, misaligning SDs with columns; sweep()
  # divides each column by its own SD as intended
  normSEs = sweep(SEs,2,apply(real,2,sd),"/")
  curr_scores = c(mean(rhos1),mean(rhos2),min(rhos1),min(rhos2),
                  mse,mean(normSEs))
  names(curr_scores) = c("mean_rho","mean_spearman_rho","min_rho","min_spearman_rho",
                         "MSE","mean MSE/SD")
  results_metrics = rbind(results_metrics,
                          curr_scores)
  rownames(results_metrics)[nrow(results_metrics)] = nn
}
# Same performance summary for the covariate-only baseline predictions.
cov_results_metrics = c()
for(nn in names(cov_prediction_analysis_results)){
  preds = cov_prediction_analysis_results[[nn]]$preds
  real = cov_prediction_analysis_results[[nn]]$real
  # per-metabolite Pearson/Spearman correlations (diagonal entries)
  rhos1 = diag(cor(preds,real))
  rhos2 = diag(cor(preds,real,method="spearman"))
  SEs = (preds-real)^2
  mse = mean(SEs)
  # BUGFIX: SEs / apply(real,2,sd) recycled the per-column SDs down the
  # rows; sweep() divides each column by its own SD as intended
  normSEs = sweep(SEs,2,apply(real,2,sd),"/")
  curr_scores = c(mean(rhos1),mean(rhos2),min(rhos1),min(rhos2),
                  mse,mean(normSEs))
  names(curr_scores) = c("mean_rho","mean_spearman_rho","min_rho","min_spearman_rho",
                         "MSE","mean MSE/SD")
  cov_results_metrics = rbind(cov_results_metrics,
                              curr_scores)
  rownames(cov_results_metrics)[nrow(cov_results_metrics)] = nn
}
# Some boxplots
# the target (test) dataset is the first ";"-separated field of each key
pred_targets = sapply(names(prediction_analysis_results),function(x)
strsplit(x,split = ";")[[1]][1])
# focus on one example target dataset
target = "liver_powder,metab_t_tca,named"
curr_res = unique(results_metrics[pred_targets == target,])
# strip the target prefix so row names show only the training dataset
rownames(curr_res) = gsub(target,"",rownames(curr_res))
rownames(curr_res) = gsub(";","",rownames(curr_res))
# the self-prediction row (empty after stripping) keeps the target name
rownames(curr_res)[rownames(curr_res)==""] = target
# covariate-only baseline scores for the same target
cov_baseline = cov_results_metrics[target,]
par(mar = c(8,4,2,2))
# highlight the self-prediction bar in black
cols = rep("blue",nrow(curr_res))
cols[rownames(curr_res)==target] = "black"
# column 4 = min Spearman rho across metabolites
plt = barplot(curr_res[,4],beside = T,xaxt="n",legend=F,
ylab="Min rho (Spearman)",
ylim = c(0.5,1),xpd=F,col=cols,
main = target)
text(plt, par("usr")[3], labels = rownames(curr_res),
srt = 45, adj = c(1.1,1.1), xpd = T, cex=0.6)
# dashed red line: the covariate-only baseline
abline(h=cov_baseline[4],lty=2,lwd=2,col="red")
par(mar = c(8,4,2,2))
# column 6 = mean standardized squared error
plt = barplot(curr_res[,6],beside = T,xaxt="n",legend=F,
ylab="Mean standardized SE",xpd=F,
main = target,col=cols)
text(plt, par("usr")[3], labels = rownames(curr_res),
srt = 45, adj = c(1.1,1.1), xpd = T, cex=0.6)
abline(h=cov_baseline[6],lty=2,lwd=2,col="red")
# --- Exploratory alternatives kept for reference (all commented out) ---
# Approaches that were tried but not used in the final analysis:
# multi-task learning (MTL_wrapper), PLS regression, and a PCA-based
# explained-variance analysis with covariate regression.
# preds = c();real=c()
# for(j in 1:nfolds){
# tr_x = x[folds!=j,]
# tr_y = y[folds!=j,]
# te_x = x[folds==j,]
# te_y = y[folds==j,]
# model = MTL_wrapper(tr_x,tr_y,type="Regression", Regularization="L21")
# te_preds = predict(model,te_x)
# real = rbind(real,te_y)
# preds = rbind(preds,te_preds)
# }
# diag(cor(preds,real))
# Using PLS regression
# library(pls)
# pls_model = plsr(y~x,ncomp = 5,validation="LOO")
# eval = MSEP(pls_model)
#
# y_pca = prcomp(y)
# plot(y_pca)
# explained_var = y_pca$sdev^2/sum(y_pca$sdev^2)
# y_pca_matrix = y_pca$x[,1:10]
#
# # regress out sex, weight
#
# get_explained_variance_using_PCA(x,y)
# x = apply(x,2,regress_out,covs=covs)
# y = apply(y,2,regress_out,covs=covs)
# get_explained_variance_using_PCA(x,y)